1. Load Data
2. Extract Menu Entities from Reviews
- Visual Example of Entity Patterns & Extraction
- Combine Entities with Review Data
- Count Menu Items & Categories
- Extract Menu Entities from Reviews
3. Questions Displayed Visually
- Are there more menu item mentions in negative or positive reviews?
- Do average menu item mentions differ by platform?
- Which menu sub-categories are frequently mentioned in reviews?
- Do menu item frequencies change over time?
- Have menu item frequencies per platform changed over time?
- Which sides are being mentioned the most?
- Have the sides mentioned in reviews changed over time?
- Has the relative percentage of negative reviews changed over time?
4. Geospatial Sentiment Visualization
- Load & Visualize Shape File
- Link Review Data to Shape Coordinates
- Aggregate Columns before Grouping by District
- Group by District & Calculate Ratios/ Percentages
- Load Geojson & Merge with Shape File
5. Plotly Choropleth
6. GeoPandas Choropleth
- Prepare Data for Tableau
# --- Notebook setup ---------------------------------------------------------
# IPython magics: auto-reload edited modules and render matplotlib inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Project-wide imports (pandas/np/sns/json/plotly/spacy helpers, ...) are
# bundled in libraries_bert; the star-import keeps notebook cells terse.
from libraries_bert import *
np.random.seed(0)        # reproducible sampling
sns.set_style('white')
# Plotly credentials are kept out of the notebook in a local JSON file.
with open('G:\\aws_processed\\info.json', 'r') as f:
    d = json.load(f)
username = d['plotly_username']
api_key = d['plotly_api_key']
auth = HTTPBasicAuth(username, api_key)
headers = {'Plotly-Client-Platform': 'python'}
init_notebook_mode(connected=True)  # enable offline plotly rendering
import warnings
warnings.filterwarnings('ignore')   # NOTE(review): hides all warnings, incl. deprecations
# --- 1. Load Data -----------------------------------------------------------
# Reviews already scored with BERT sentiment (one row per review).
data = pd.read_json('restaurant1_english_sentiment.json', orient='columns',
                    convert_dates=True)
# Store/location metadata; street-address columns dropped for privacy.
location = pd.read_json('locations.json', orient='columns')
location.drop(['address_line_1','address_line_2'], inplace=True, axis=1)
# Get relevant columns from dataset
data_cut = data[['review_id','store_id','platform_name', 'clean_text','combined',
                 'bert_label', 'rating', 'bert_pos','bert_neg', 'date']].copy()
# merge location and review data
full_data = data_cut.merge(location, on='store_id', how='left')
# split review data by sentiment; reset indexes so later positional
# assignment of extracted entities lines up row-for-row
full_data_pos = full_data.loc[full_data.bert_label == 'pos'].copy()
full_data_neg = full_data.loc[full_data.bert_label == 'neg'].copy()
full_data_pos.reset_index(inplace=True, drop=True)
full_data_neg.reset_index(inplace=True, drop=True)
len(full_data_pos), len(full_data_neg)  # cell output: review counts per sentiment
# Identifying location info has not been outputted to ensure privacy.
full_data_pos[['review_id', 'store_id', 'platform_name', 'clean_text', 'combined',
               'bert_label', 'rating', 'bert_pos', 'bert_neg', 'date',
               'country', 'city', 'brand_id']].head()
Match patterns were created with SpaCy and used with their EntityRuler method to extract the menu items from the review text. The flexibility of their patterns — the ability to use parts-of-speech, to create complex matches with lists of possible words, and the option to exclude certain words — makes this a powerful option.
The patterns were created from the menu and adjusted to include the most common misspellings and phrasings. Around 3000 reviews were manually inspected using Prodigy annotation software.
# --- 2. Extract Menu Entities from Reviews ----------------------------------
# Load the hand-curated SpaCy match patterns (one JSON object per line).
match_patterns = []
with open(".../menu_seeds2.jsonl", "r") as read_file:
    for line in read_file:
        match_patterns.append(json.loads(line))
match_patterns[:5]  # cell output: sample of the loaded patterns
# Build a pipeline that tags only our MENU entities: drop the statistical
# NER component and drive tagging purely from the patterns above.
nlp = en_core_web_lg.load()
nlp.remove_pipe("ner")
ruler = EntityRuler(nlp)
ruler.add_patterns(match_patterns)
nlp.add_pipe(ruler)
# settings for displacy visualizer
colors = {"MENU": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["MENU"], "colors": colors}
# visualize the entities highlighted in the text and
# the associated ids (i.e., menu subcategories)
test_text = full_data_neg.clean_text[:50]
for doc in list(nlp.pipe(test_text)):
    if len(doc.ents) >= 1:
        for ent in doc.ents:
            print(ent.text + ' --> ' + ent.ent_id_)
        displacy.render(doc, style="ent", options=options)
        print('-'*100)
# cell magic: times the extraction cell that follows
%%time
def get_ents(df, col, patterns):
    """
    Return a dictionary of menu entities from SpaCy EntityRuler,
    based on match patterns.
    params:
        df: Dataframe with text reviews.
        col: Column to apply the EntityRuler on.
        patterns: Entity patterns to use for extraction.
    Returns one dict per review:
        {"text": [entity strings], "id": [entity ids], "review_id": ...}
    """
    # Fresh pipeline: statistical NER removed, pattern-driven ruler added.
    pipeline = en_core_web_lg.load()
    pipeline.remove_pipe("ner")
    entity_ruler = EntityRuler(pipeline)
    entity_ruler.add_patterns(patterns)
    pipeline.add_pipe(entity_ruler)
    extracted = []
    review_ids = df.review_id.tolist()
    docs = list(pipeline.pipe(df[col].tolist()))
    for r_id, doc in zip(review_ids, docs):
        texts = [ent.text for ent in doc.ents]
        ids = [ent.ent_id_ for ent in doc.ents]
        extracted.append({"text": texts, "id": ids, "review_id": r_id})
    return extracted
# Run the extraction separately for positive and negative reviews.
pos_menu_ents = get_ents(full_data_pos, 'clean_text', match_patterns)
neg_menu_ents = get_ents(full_data_neg, 'clean_text', match_patterns)
len(pos_menu_ents), len(neg_menu_ents)  # should equal the review counts above
def get_menu_lists(ent_dict):
    """Extract lists of menu items and menu categories
    from the entity dict.

    Returns two parallel lists (one entry per review): the entity
    text lists and the entity id (sub-category) lists."""
    items = [review['text'] for review in ent_dict]
    categories = [review['id'] for review in ent_dict]
    return items, categories
# --- Combine Entities with Review Data --------------------------------------
menu_items_pos, menu_cats_pos = get_menu_lists(pos_menu_ents)
menu_items_neg, menu_cats_neg = get_menu_lists(neg_menu_ents)
# Positional alignment is safe: both frames were reset_index'ed above and
# the entity lists were produced in the same row order.
full_data_pos['menu_items'] = pd.Series(menu_items_pos)
full_data_pos['menu_cats'] = pd.Series(menu_cats_pos)
full_data_neg['menu_items'] = pd.Series(menu_items_neg)
full_data_neg['menu_cats'] = pd.Series(menu_cats_neg)
full_data_neg[['menu_items', 'menu_cats']].head()
# --- Count Menu Items & Categories ------------------------------------------
full_data_neg['num_items'] = [len(full_data_neg['menu_items'][i]) for i in full_data_neg.index]
full_data_neg['num_cats'] = [len(set(full_data_neg['menu_cats'][i])) for i in full_data_neg.index]
# menu categories are only counted once per review
full_data_pos['num_items'] = [len(full_data_pos['menu_items'][i]) for i in full_data_pos.index]
full_data_pos['num_cats'] = [len(set(full_data_pos['menu_cats'][i])) for i in full_data_pos.index]
full_data_neg[['clean_text','menu_items', 'menu_cats', 'num_items', 'num_cats']].head()
def plot_comparison(df1, df2, col, normalize=True, title=None):
    """Bar-plot the share of reviews with zero vs. one-or-more values in `col`.

    params:
        df1, df2: dataframes to compare (e.g. positive vs. negative reviews).
        col: integer count column (e.g. 'num_items').
        normalize: if True plot percentages, otherwise raw counts.
        title: plot title.

    Note: `value_counts()[0]` looks up the bucket for a count of 0 (no
    mentions). BUGFIX: the "categories" totals previously summed
    `value_counts()[1:-1]`, silently dropping the rarest count bucket;
    they now sum `[1:]` so every non-zero bucket is included.
    """
    d1 = {}
    d2 = {}
    if normalize:
        d1['no categories'] = df1[col].value_counts(normalize=True)[0]*100
        d1['categories'] = sum(df1[col].value_counts(normalize=True)[1:]*100)
        d2['no categories'] = df2[col].value_counts(normalize=True)[0]*100
        d2['categories'] = sum(df2[col].value_counts(normalize=True)[1:]*100)
    else:
        d1['no_cats'] = df1[col].value_counts()[0]
        d1['cats'] = sum(df1[col].value_counts()[1:])
        d2['no_cats'] = df2[col].value_counts()[0]
        d2['cats'] = sum(df2[col].value_counts()[1:])
    X = np.arange(len(d1))
    fig = plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    # df1 bars centred on X; df2 bars offset to the left
    ax.bar(X, list(d1.values()), width=0.2, color='powderblue', align='center')
    ax.bar(X-0.2, list(d2.values()), width=0.2, color='salmon', align='center', alpha=0.8)
    ax.legend(('Positive Reviews','Negative Reviews'))
    plt.xticks(X, list(d1.keys()), fontsize=12)
    plt.title(title, fontsize=17)
# --- Q1: item mentions in positive vs negative reviews ----------------------
plot_comparison(full_data_pos, full_data_neg, 'num_items', normalize=True,
                title = '% of Reviews that Mention Menu Items')
# --- Q2: average item mentions per review, by platform ----------------------
fig = plt.figure(figsize=(12, 6))
full_data_neg.groupby(['platform_name'])['num_items'].mean()\
             .sort_values()\
             .plot(kind='bar')
plt.xticks(np.arange(7), rotation=0, fontsize=12)  # assumes 7 platforms -- TODO confirm
plt.xlabel(None)
plt.title('Average Number of Named Items per Negative Review by Platform',
          fontsize=14);
fig = plt.figure(figsize=(12, 6))
full_data_pos.groupby(['platform_name'])['num_items'].mean()\
             .sort_values()\
             .plot(kind='bar')
plt.xticks(np.arange(7), rotation=0, fontsize=12)  # assumes 7 platforms -- TODO confirm
plt.xlabel(None)
plt.title('Average Number of Named Items per Positive Review by Platform',
          fontsize=14);
def get_category_freq(df, col):
    """Create a dictionary of category frequencies.

    Flattens the per-review category lists in `col` and returns
    {category: total mention count}."""
    flattened = itertools.chain.from_iterable(df[col].tolist())
    return dict(FreqDist(flattened))
# Category mention frequencies per sentiment, plus a merged grand total.
neg_cats_freq = get_category_freq(full_data_neg, 'menu_cats')
pos_cats_freq = get_category_freq(full_data_pos, 'menu_cats')
all_cats_freq = {k: neg_cats_freq.get(k, 0) + pos_cats_freq.get(k, 0) for k in set(neg_cats_freq) | set(pos_cats_freq)}
def plot_menu_cats(freq_dict, title=None, normalize=True):
    """Plot category frequencies from a frequency dict.

    params:
        freq_dict: {category: mention count}.
        title: chart title.
        normalize: if True plot each category as a % of all mentions,
            otherwise plot raw frequencies.
    """
    df = pd.DataFrame(list(freq_dict.items()), columns=['categories', 'frequency'])
    if normalize:
        df['percentage'] = (df.frequency/sum(df.frequency))*100
        value_col = 'percentage'
        xlabel = "Percentage of Menu Category Mentions in Reviews"
    else:
        value_col = 'frequency'
        xlabel = "Frequency of Menu Category Mentions in Reviews"
    # sort df by Count column (largest first)
    df = df.sort_values([value_col], ascending=False).reset_index(drop=True)
    plt.figure(figsize=(12,10))
    # plot barh chart with index as x values
    ax = sns.barplot(x=value_col, y="categories", data=df)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Menu Categories', fontsize=12)
    ax.tick_params(axis="y", labelsize=12)
    ax.set_title(label=title, fontsize=14);
# --- Q3: most-mentioned menu sub-categories ---------------------------------
# NOTE(review): the first two titles say "Percentage" but normalize=False
# plots raw frequencies -- titles and flag look inconsistent; confirm intent.
plot_menu_cats(neg_cats_freq, title='Percentage of Menu Category Mentions in Negative Reviews',
               normalize=False)
plot_menu_cats(pos_cats_freq, title='Percentage of Menu Category Mentions in Positive Reviews',
               normalize=False)
plot_menu_cats(all_cats_freq, title='Percentage of Menu Category Mentions in All Reviews',
               normalize=True)
def plotly_plot(df_cut, title, yaxis=None, xaxis=None, filename=None):
    """Plot an interactive plotly chart from a specified dataframe.

    params:
        df_cut: dataframe whose index is the x axis and whose columns
            each become a scatter trace.
        title: chart title.
        yaxis: y-axis label.
        xaxis: x-axis label. BUGFIX: previously accepted but ignored;
            it is now used, defaulting to 'Time' (the old hard-coded
            label) for backward compatibility.
        filename: if given, also upload the chart to plotly and write
            an offline HTML copy.
    """
    fig = go.Figure()
    for col in df_cut.columns:
        fig.add_trace(go.Scatter(
            x=df_cut.index,
            y=df_cut[col],
            name=col))
    fig.update_layout(
        title=title,
        xaxis_title=xaxis if xaxis is not None else 'Time',
        yaxis_title=yaxis,
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"))
    fig.show()
    if filename:
        url = py.iplot(fig, filename=filename, sharing='public', auto_open=False)
        plotly.offline.plot(fig, filename=filename + '.html', auto_open=False)
def plot_category_trends(data, item_col, cat_cols, filename=None,
                         title=None, normalize=True,
                         year_start='2011', year_end='2019', time='Y'):
    """Plot time series data for categories.

    params:
        data: review dataframe with a 'date' column and a list column.
        item_col: list column to one-hot encode (e.g. 'menu_cats').
        cat_cols: categories (columns) to plot.
        filename: optional name forwarded to plotly_plot for saving.
        title: chart title.
        normalize: if True plot each category as a % of the period's
            total mentions, otherwise raw counts.
        year_start, year_end: window bounds (year-end dates; start is
            exclusive, end inclusive).
        time: pandas resample frequency ('Y', 'M', ...).
    """
    start = year_start + '-12-31'
    end = year_end + '-12-31'
    df = data.set_index(pd.to_datetime(data.date).dt.normalize(), drop=True)
    mask = (df['date'] > start) & (df['date'] <= end)
    df = df.loc[mask]
    df.drop('date', inplace=True, axis=1)
    # one-hot encode the list column so mentions can be summed per period
    df = df[item_col].str.join('|').str.get_dummies()
    year_gr = df.resample(time).sum()
    if normalize:
        yaxis = 'Percentage'
        # taken as a % of reviews in that year
        year_gr['total'] = year_gr.sum(axis=1)
        for col in year_gr.columns:
            year_gr[col] = round((year_gr[col]/year_gr['total'])*100, 2)
    else:
        yaxis = 'Frequency'
    df_cut = year_gr[cat_cols].copy()
    # BUGFIX: `filename` was previously passed positionally into the
    # `xaxis` slot of plotly_plot, so charts were never uploaded/saved.
    plotly_plot(df_cut, title, yaxis=yaxis, filename=filename)
# --- Q4: category trends over time (positive reviews, monthly) --------------
plot_category_trends(full_data_pos, 'menu_cats',
                     ['breakfast', 'side', 'steaks', 'burger', 'ribs',
                      'chicken', 'dessert', 'sauces','specials'],
                     #filename='menu_cats_positive.png',
                     title = "Trends for Menu Categories in Positive Reviews",
                     normalize=True, year_start='2016', year_end='2018', time='M')
# --- Negative reviews, yearly -----------------------------------------------
plot_category_trends(full_data_neg, 'menu_cats',
                     ['side', 'steaks', 'burger', 'breakfast', 'ribs', 'chicken',
                      'dessert','sauces', 'specials'],
                     title = "Trends for Menu Categories in Negative Reviews",
                     year_start='2015', time='Y')
def reset_index(df):
    '''Returns DataFrame with index as columns, then re-indexed by the
    normalized datetime in its 'date' level.'''
    index_as_cols = df.index.to_frame(index=False)
    body = df.reset_index(drop=True)
    # In merge it's important the order in which you pass the dataframes
    # if the index contains a Categorical.
    # pd.merge(df, index_df, left_index=True, right_index=True) does not work
    combined = pd.merge(index_as_cols, body, left_index=True, right_index=True)
    combined = combined.set_index(pd.to_datetime(combined.date).dt.normalize(), drop=True).copy()
    return combined.drop('date', axis=1)
def plot_platforms(df, title=None, normalize=True,
                   year_start='2011', year_end='2019',
                   time='Y'):
    """ Plot time series data for review platforms."""
    # Window bounds: year-end dates; start exclusive, end inclusive.
    start = year_start + '-12-31'
    end = year_end + '-12-31'
    # NOTE(review): mutates the caller's frame (category cast) -- side effect.
    df['platform_name'] = df['platform_name'].astype('category')
    platforms = df.set_index(pd.to_datetime(df.date).dt.normalize(), drop=True).copy()
    mask = (platforms['date'] > start) & (platforms['date'] <= end)
    platforms = platforms.loc[mask]
    platforms.drop('date', inplace=True, axis=1)
    # num_cats summed per date per platform (one column per platform)
    pl_df = platforms.pivot_table(index='date', columns='platform_name',
                                  values='num_cats', aggfunc='sum')
    pl_df = reset_index(pl_df)
    pl_df = pl_df.resample(time).sum()
    pl_df['total'] = pl_df.sum(axis=1)
    if normalize:
        # NOTE(review): columns[:-2] skips 'total' AND the last platform
        # column -- looks like an off-by-one ([:-1] would keep every
        # platform). The same slice drives the traces below; confirm
        # whether the last platform is deliberately excluded.
        for col in pl_df.columns[:-2]:
            pl_df[col] = round((pl_df[col]/pl_df['total'])*100, 2)
    fig = go.Figure()
    for col in pl_df.columns[:-2]:
        fig.add_trace(go.Scatter(
            x=pl_df.index,
            y=pl_df[col],
            name=col))
    fig.update_layout(
        title=title,
        xaxis_title="Years",
        yaxis_title="Percentage",
        font=dict(
            family="Courier New, monospace",
            size=16,
            color="#7f7f7f"))
    fig.show()
# --- Q5 plot; Q6 prep -------------------------------------------------------
plot_platforms(full_data_pos, title = "Menu Category Details in Positive Reviews by Platform",
               normalize=True, year_end='2018')
# Slice to reviews whose category list contains 'side'.
neg_sides = full_data_neg[full_data_neg.menu_cats.map(set(['side']).issubset)]
pos_sides = full_data_pos[full_data_pos.menu_cats.map(set(['side']).issubset)]
# create a lookup dictionary to group synonyms (for example: chips, fries, and
# potato chips are all referring to the same item - chips)
subs= {'hot veg': ['vegies', 'cooked vegetables', 'veggies','veg', 'vegetables','hot veg',
                   'veggie', 'hot veggies','side veg','vegetable', 'side of vegetables',
                   'cooked veggies','hot vegetables', 'side vegetables', 'side of veggies',
                   'hot veggie', 'carrot', 'hot vegies', 'side veggies', 'cooked veg'],
       'baked potato': ['baked potato', 'baked potatoes', 'bake potato', 'jacket potato'],
       'cream spinach': ['spinach', 'creamed spinach', 'cream spinach'],
       'butternut': ['pumpkin', 'butternut'],
       'onion rings': ['onion rings', 'ring onions', 'onions rings','onion ring', 'onions ring',
                       'ring onion'],
       'chips': ['chips','chip', 'fries', 'potato chips', 'potato fries'],
       'sweet potato fries': ['sweet potato chips', 'sweet potato fries', 'sweetpotato chips'],
       'side dish': ['side dish','side dishes'],
       'toast': ['toast'],
       'mushrooms': ['mushrooms'],
       'side salad': ['side salad'],
       'smileys': ['smileys']}
def invert_dict(d):
    """ Return inverse dictionary.
    Example: current_dict['chips'] = 'sweet potato fries',
    inverse_dict['sweet potato fries'] = 'chips'.
    Each synonym maps to a LIST of canonical keys, since a synonym
    may belong to more than one group.
    """
    inverse = {}
    for canonical, synonyms in d.items():
        for synonym in synonyms:
            # first sighting creates the list; later ones extend it
            inverse.setdefault(synonym, []).append(canonical)
    return inverse
inv_subs = invert_dict(subs)  # synonym string -> list of canonical side names
def flatten(A):
    """ Flatten nested lists. Iterative (stack-based) version;
    preserves element order. """
    flat = []
    stack = [iter(A)]
    while stack:
        for element in stack[-1]:
            if isinstance(element, list):
                stack.append(iter(element))
                break
            flat.append(element)
        else:
            stack.pop()
    return flat
def get_sides(df, inv_subs, title=None):
    """Using a lookup dict, return grouped sides.

    Collects every 'side'-category mention, maps it to its canonical
    name(s) via `inv_subs`, counts the flattened result, and plots the
    frequencies with plot_menu_cats.

    params:
        df: review dataframe with 'menu_items'/'menu_cats' list columns.
        inv_subs: {synonym: [canonical side name]} lookup.
        title: forwarded to plot_menu_cats.
    """
    sides = []
    for items, cats in zip(df.menu_items, df.menu_cats):
        for item, cat in zip(items, cats):
            if cat == 'side':
                sides.append(item)
    # CLEANUP: a FreqDist over the raw, un-grouped sides was computed here
    # and immediately overwritten below; removed as dead code.
    replaced_sides = [inv_subs.get(item, item) for item in sides]
    flat_sides = flatten(replaced_sides)
    fdist = FreqDist(flat_sides)
    sides_freq = dict()
    for word, frequency in fdist.items():
        sides_freq[word] = frequency
    plot_menu_cats(sides_freq, title=title)
# --- Q6: most-mentioned sides, by sentiment ---------------------------------
get_sides(neg_sides, inv_subs, title= 'Sides Mentioned in Negative Reviews')
get_sides(pos_sides, inv_subs, title= 'Sides Mentioned in Positive Reviews')
def sides_df(df):
    """Return dataframe with the grouped (canonical) sides per review.

    Keeps only reviews whose category list includes 'side', maps every
    recognised side mention to its canonical name(s) via the module-level
    `inv_subs` lookup, de-duplicates per review, and merges the result
    back onto the sliced review data.

    Note: de-duplication uses set(), so per-review item order is arbitrary.
    """
    sides_sliced = df[df.menu_cats.map(set(['side']).issubset)]
    sides_dicts = []
    for items, r_id in zip(sides_sliced.menu_items, sides_sliced.review_id):
        # only mentions present in the synonym lookup count as side dishes
        # (CLEANUP: removed a redundant items_replaced=[] pre-initialisation
        # and the superfluous .keys() membership test)
        side_items = [item for item in items if item in inv_subs]
        # a synonym can map to several canonical names, so flatten
        items_replaced = [inv_subs[item] for item in side_items]
        unique_items = list(set(flatten(items_replaced)))
        sides_dicts.append({'side_items': unique_items, 'review_id': r_id})
    grouped = pd.DataFrame.from_dict(sides_dicts)
    return grouped.merge(sides_sliced, on='review_id', how='left')
# --- Q7: side trends over time ----------------------------------------------
# BUGFIX: these cells called `plot_cat_trends`, which is not defined
# anywhere in this notebook -- the function is named `plot_category_trends`.
neg_sides_df = sides_df(full_data_neg)
plot_category_trends(neg_sides_df, 'side_items', list(subs.keys()),
                     title = "Trends for Sides in Negative Reviews", normalize=True,
                     year_start='2014', year_end='2018', time='Y')
pos_sides_df = sides_df(full_data_pos)
plot_category_trends(pos_sides_df, 'side_items', list(subs.keys()),
                     title = "Trends for Sides in Positive Reviews", normalize=True,
                     year_start='2009', year_end='2018', time='Y')
def plot_percent_negative(neg_df, pos_df, col, cat_cols=None,
                          year_start='2011', year_end='2019',
                          time = 'Y', normalize = 'category', title=None):
    """Plot the % of category mentions that come from negative reviews.

    params:
        neg_df, pos_df: negative / positive review dataframes, each with
            a 'date' column and a list column `col`.
        col: list column to one-hot encode (e.g. 'menu_cats').
        cat_cols: categories to plot (required).
        year_start, year_end: window bounds (inclusive year-end dates).
        time: pandas resample frequency.
        normalize:
            'category' -> negative share within each category per period;
            'year'     -> negative category mentions as a % of ALL
                          mentions in the period.
        title: chart title.

    Raises:
        ValueError: if `normalize` is not 'category' or 'year'
            (previously an invalid value fell through to a NameError).
    """
    start = year_start + '-12-31'
    end = year_end + '-12-31'
    neg_year_gr = _period_category_counts(neg_df, col, start, end, time)
    pos_year_gr = _period_category_counts(pos_df, col, start, end, time)
    if normalize == 'category':
        # % negative within each category: neg / (neg + pos), per period
        total = pd.DataFrame()
        for cat in cat_cols:
            total[cat] = pos_year_gr[cat] + neg_year_gr[cat]
        neg_perc = pd.DataFrame()
        for cat in total.columns:
            neg_perc[cat] = round((neg_year_gr[cat]/total[cat])*100, 2)
    elif normalize == 'year':
        # negative mentions as a share of ALL mentions in the period
        neg_year_gr['total'] = pos_year_gr.sum(axis=1) + neg_year_gr.sum(axis=1)
        neg_perc = pd.DataFrame()
        for cat in cat_cols:
            neg_perc[cat] = round((neg_year_gr[cat]/neg_year_gr['total'])*100, 2)
    else:
        raise ValueError("normalize must be 'category' or 'year'")
    plotly_plot(neg_perc, title)

def _period_category_counts(df, col, start, end, time):
    """One-hot the list column `col` and sum mentions per resample
    period within the inclusive [start, end] date window."""
    indexed = df.set_index(pd.to_datetime(df.date).dt.normalize(), drop=True)
    mask = (indexed['date'] >= start) & (indexed['date'] <= end)
    indexed = indexed.loc[mask]
    dummies = indexed[col].str.join('|').str.get_dummies()
    return dummies.resample(time).sum()
# --- Q8: has the negative share changed over time? --------------------------
# Normalize by category - within each side subcategory (compared only to
# itself), what % of its mentions come from negative reviews?
plot_percent_negative(neg_sides_df, pos_sides_df, 'side_items',
                      ['butternut', 'chips', 'cream spinach', 'hot veg',
                       'mushrooms', 'onion rings'], year_start='2013',
                      year_end='2019', time='Y',
                      normalize='category',
                      title='% Negative - All Reviews Discussing Sides')
# Normalize by year - adjusting for overall mention volume per year, what
# share of ALL side mentions were negative mentions of each subcategory?
plot_percent_negative(neg_sides_df, pos_sides_df, 'side_items',
                      ['butternut', 'chips', 'cream spinach', 'hot veg',
                       'mushrooms', 'onion rings'], year_start='2013',
                      year_end='2019', time='Y',
                      normalize='year',
                      title='% Negative - All Reviews Discussing Sides')
# Normalized for category - Adjusting within food subcategory (i.e., comparing only to itself)
# how have the negative reviews mentioning food categories changed over the chosen timeframe?
plot_percent_negative(full_data_neg, full_data_pos, 'menu_cats',
                      cat_cols=['side', 'steaks', 'burger', 'breakfast',
                                'ribs', 'chicken', 'dessert', 'sauces', 'specials'],
                      year_start='2017', year_end='2019', time='M',
                      normalize='category',
                      title='% Negative - Total Reviews with Menu Categories')
# --- 4. Geospatial Sentiment Visualization ----------------------------------
# spatial file containing 51 districts
spatial_file = '...\\geospatial\\district\\layer1-2.shp'
map_df = gpd.read_file(spatial_file)
map_df.head()
map_df.plot(figsize=(10, 8));
# Only use stores in South Africa (country coding is inconsistent)
sa_restaurants = full_data.loc[(full_data.country == 'ZA') | \
                               (full_data.country == 'South Africa')].copy()
# Create the geometry POINTS column from the coordinates
sa_restaurants["geometry"] = sa_restaurants.apply(lambda row: Point(row["longitude"], \
                                                                    row["latitude"]), axis=1)
# Concatenate extracted menu and item data from above analysis
# and add to restaurant/ location data
full_data_pos_cut = full_data_pos[['review_id', 'menu_items', 'menu_cats',\
                                   'num_items', 'num_cats']].copy()
full_data_neg_cut = full_data_neg[['review_id', 'menu_items', 'menu_cats',\
                                   'num_items', 'num_cats']].copy()
full_data_cut = pd.concat([full_data_pos_cut, full_data_neg_cut])
places = sa_restaurants.merge(full_data_cut, on='review_id', how = 'left')
# Remove mixed sentiment reviews
places = places.loc[places.bert_label != 'mixed']
# Longitude and latitude columns no longer needed
del(places["latitude"], places["longitude"])
# Convert to geopandas dataframe and assign the geometry column
places_geo = geopandas.GeoDataFrame(places, geometry="geometry")
# Declare the coordinate system for the places GeoDataFrame
# GeoPandas doesn't do any transformations automatically when performing
# the spatial join. The layers are already in the same CRS (WGS84) so no
# transformation is needed.
places_geo.crs = {"init": "epsg:4326"}
# perform spatial join to link points to district polygons
geo_df = geopandas.tools.sjoin(map_df, places_geo, how="left")
geo_df.head()
# Look at specific item
def add_cat_freq(df, category, name):
    """Add a 0/1 indicator column `name` marking reviews whose
    'menu_cats' list contains `category`.

    params:
        df: dataframe with a 'menu_cats' list column (mutated in place).
        category: menu sub-category to flag (e.g. 'steaks').
        name: name of the new indicator column.
    """
    # comprehension replaces the manual append loop; int(bool) gives
    # the same 1/0 values as the original if/else
    df[name] = [int(category in cats) for cats in df['menu_cats']]
# choose subcategory to calculate frequency
add_cat_freq(full_data_pos, 'steaks', 'steak_pos')
add_cat_freq(full_data_neg, 'steaks', 'steak_neg')
# get relevant columns
full_data_pos_cut = full_data_pos[['review_id', 'menu_items', 'menu_cats', 'num_items',
                                   'num_cats', 'steak_pos']].copy()
full_data_neg_cut = full_data_neg[['review_id', 'menu_items', 'menu_cats', 'num_items',
                                   'num_cats', 'steak_neg']].copy()
# merge with geospatial data (review + district data)
geo_neg = geo_df.merge(full_data_neg_cut, on='review_id', how = 'left')
geo_neg = geo_neg.loc[geo_neg.bert_label == 'neg']
geo_pos = geo_df.merge(full_data_pos_cut, on='review_id', how = 'left')
geo_pos = geo_pos.loc[geo_pos.bert_label == 'pos']
# indexes for districts with missing data (no reviews of that sentiment),
# used to pad the grouped lists back to the full district count
missing_idx = [4, 6, 8, 43]
def aggregate_columns(df, idx, col):
    """Per-district review counts and category sums, padded with zeros
    for districts absent from the data.

    params:
        df: geo-joined review dataframe with an 'FID' district column.
        idx: positions at which to insert zeros for missing districts.
            BUGFIX: this parameter was previously ignored and the
            module-level `missing_idx` global used instead.
        col: indicator column to sum per district (e.g. 'steak_neg').

    Returns (counts, cat_col): parallel lists, one entry per district
    (52 districts expected after padding).
    """
    counts = df.groupby('FID')['review_id'].count().tolist()
    cat_col = df.groupby('FID')[col].sum().tolist()
    for i in idx:
        counts.insert(i, 0)
        cat_col.insert(i, 0)
    assert len(counts) == 52
    assert len(cat_col) == 52
    return counts, cat_col
# Per-district review counts and steak-mention sums, for each sentiment.
neg_counts, steak_neg = aggregate_columns(geo_neg, missing_idx, 'steak_neg')
pos_counts, steak_pos = aggregate_columns(geo_pos, missing_idx, 'steak_pos')
# Columns to discard before the district-level groupby below.
to_drop = ['CATEGORY', 'CAT2', 'DISTRICT', 'MUNICNAME', 'MAP_TITLE', 'PROVINCE',
           'PROVNAME', 'MapNo', 'MUN_CD', 'Shape_Leng', 'Shape_Area',
           'geometry', 'index_right', 'review_id', 'store_id', 'platform_name',
           'clean_text', 'combined', 'bert_label', 'bert_pos',
           'bert_neg', 'date', 'country', 'city', 'brand_id']
# add stats columns for chosen subcategory after grouping by district
def add_stats(df, neg_cat, pos_cat):
    """Append ratio/percentage columns for a sub-category, in place.

    prop_neg/prop_pos: sub-category mentions as % of all neg/pos reviews.
    percent: sub-category mentions as % of all reviews.
    perc_neg/perc_pos: negative/positive share within the sub-category.
    """
    category_total = df[neg_cat] + df[pos_cat]
    df['prop_neg'] = df[neg_cat] / df['neg_count'] * 100
    df['prop_pos'] = df[pos_cat] / df['pos_count'] * 100
    df['percent'] = category_total / df['total_reviews'] * 100
    df['perc_neg'] = df[neg_cat] / category_total * 100
    df['perc_pos'] = df[pos_cat] / category_total * 100
# --- Group by District & Calculate Ratios / Percentages ---------------------
geo_df2 = geo_df.copy()
geo_df2.drop(to_drop, inplace=True, axis=1)
districts_geo_df = geo_df2.groupby('FID').sum().reset_index()
# calculate ratios and percentages on grouped data so that we get values per district
districts_geo_df['neg_count'] = neg_counts
districts_geo_df['pos_count'] = pos_counts
# sjoin was how="left", so every district appears in geo_df and the
# grouped counts line up positionally with the padded lists above
districts_geo_df['total_reviews'] = geo_df.groupby('FID')['review_id'].count().tolist()
districts_geo_df['pos_neg_ratio'] = (districts_geo_df['pos_count']/districts_geo_df['neg_count'])
districts_geo_df['neg_pos_ratio'] = (districts_geo_df['neg_count']/districts_geo_df['pos_count'])
# mean star rating per district (summed rating / number of reviews)
districts_geo_df['rating'] = districts_geo_df['rating']/districts_geo_df['total_reviews']
# add stats on steaks subcategory
districts_geo_df['steak_pos'] = steak_pos
districts_geo_df['steak_neg'] = steak_neg
add_stats(districts_geo_df, 'steak_neg', 'steak_pos')
# divisions by zero above yield NaN/inf; zero them out for mapping
districts_geo_df.fillna(0, inplace=True)
districts_geo_df
# load geojson for Plotly
with open('...\\geospatial\\layer1-2.json') as response:
    districts = json.load(response)
# merge with full data (processed geospatial + review + calculations)
merged = map_df.set_index('FID').join(districts_geo_df.set_index('FID'))
merged.reset_index(inplace=True)
merged.fillna(0, axis=0, inplace=True)
merged.head()
# --- 5. Plotly Choropleth ---------------------------------------------------
color = "Viridis"
# District-level choropleth of the positive:negative review ratio.
fig = go.Figure(go.Choroplethmapbox(geojson=districts,
                                    locations=merged.FID,
                                    z=merged.pos_neg_ratio,
                                    text=merged.MUNICNAME,
                                    name='',
                                    below='traces',
                                    colorscale=color, zmin=0, zmax=15,
                                    colorbar = dict(thickness=20, ticklen=3),
                                    hovertemplate = '<b>District</b>: <b>%{text}</b>'+
                                                    '<br><b>Val </b>: %{z}<br>',
                                    hoverinfo = 'text',
                                    marker_opacity=0.7, marker_line_width=0.1))
# Base map centred on South Africa; district polygons filled light green
# underneath the data layer.
fig.update_layout(mapbox = {
    'style': "carto-positron",
    'zoom': 5,
    'center': {"lat": -30.5595, "lon": 22.9375},
    'layers': [
        {'source': {
            'type': "FeatureCollection",
            'sourcetype': 'geojson',
            'features': districts['features']},
         'type': 'fill', 'below': 'traces', 'color': 'lightgreen'}]})
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
# --- 6. GeoPandas Choropleth ------------------------------------------------
# set a variable that will call whatever column we want to visualise on the map
variable = 'perc_neg'
# set the range for the choropleth
vmin, vmax = 0, 50
# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(14, 10))
# create map
color = "Blues"
merged.plot(variable, cmap=color, linewidth=0.2, ax=ax, edgecolor='0.1', alpha=0.8)
ax.axis('off')
# add a title
ax.set_title('% of Reviews Mentioning Steaks that are Negative',
             fontdict={'fontsize': '25', 'fontweight' : '3'})
# Create colorbar as a legend
sm = plt.cm.ScalarMappable(cmap=color, norm=plt.Normalize(vmin=vmin, vmax=vmax))
# the mappable needs an (empty) data array before it can be drawn
sm._A = []
# add the colorbar to the figure
cbar = fig.colorbar(sm)
# --- Prepare Data for Tableau -----------------------------------------------
# Expand categories into multiple rows: for example, if one review
# has the categories: sides and steak, there will be two rows with
# the same review - one for sides and one for steak.
# create pandas dataframe containing geometry information
# to export into Tableau
geo_pd = pd.DataFrame(geo_df)
geo_pd.reset_index(inplace=True, drop=True)
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """ Transform each element of a list-like to a row,
    replicating the index values and other fields.
    Return dataframe with exploded category.

    params:
        df: input dataframe.
        lst_cols: list column(s) to explode; when several are given
            they must have equal per-row lengths.
        fill_value: value used for rows whose lists are empty.
        preserve_index: keep the (repeated) original index instead of
            resetting it.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols},
        index=idx)
        .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                   for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed
        # in 2.0; pd.concat is the supported equivalent.
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
               .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res
def expand_cats(df, col):
    """ Preprocesses dataframe and explode chosen category.
    Ensure null values are replaced with empty strings.
    Ensure that a category is only exploded once for a
    single sample."""
    df[col].fillna('', inplace=True)
    # de-duplicate each row's category list before exploding
    # (single apply replaces the original set-then-list pair of applies)
    df[col] = df[col].apply(lambda cats: list(set(cats)))
    return explode(df, col)
# Explode menu categories: one row per (review, category) pair.
geo_pd = expand_cats(geo_pd, 'menu_cats')
#geo_pd.to_csv('geospatial_reviews.csv')